import pandas as pd
import numpy as np
import plotly.express as px
hawker = pd.read_csv(r'C:\Users\Rahul\Documents\Hawker Centres.csv')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
hawker.head()
| X | Y | Name | Area | Rating | Reviews | Type | description | ADDRESSBLOCKHOUSENUMBER | LATITUDE | EST_ORIGINAL_COMPLETION_DATE | STATUS | CLEANINGSTARTDATE | ADDRESSUNITNUMBER | ADDRESSFLOORNUMBER | NO_OF_FOOD_STALLS | HYPERLINK | REGION | APPROXIMATE_GFA | LONGITUDE | INFO_ON_CO_LOCATORS | NO_OF_MARKET_STALLS | AWARDED_DATE | LANDYADDRESSPOINT | CLEANINGENDDATE | PHOTOURL | ADDRESSTYPE | RNR_STATUS | ADDRESSBUILDINGNAME | HUP_COMPLETION_DATE | LANDXADDRESSPOINT | ADDRESSSTREETNAME | ADDRESSPOSTALCODE | DESCRIPTION_MYENV | IMPLEMENTATION_DATE | ADDRESS_MYENV | INC_CRC | FMEL_UPD_D | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 103.850531 | 1.306206 | Buffalo Road Blk 665 (Tekka Centre/Zhu Jiao Ma... | Rochor | 4.1 | 17826.0 | MHC | HUP Standard Upgrading | 665 | NaN | 31/10/1980 | Existing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 32059.13 | NaN | http://www.nea.gov.sg/images/default-source/Ha... | I | NaN | NaN | 30/6/2009 | 29915.58 | Buffalo Road | 210665.0 | NaN | NaN | Blk 665, Buffalo Road, Singapore 210665 | D09ADF3409AC0894 | 2.020000e+13 |
| 1 | 103.887019 | 1.320648 | Aljunied Ave 2 Blk 117 (Blk 117 Aljunied Marke... | Geylang | 3.9 | 1722.0 | MHC | HUP Standard Upgrading | 117 | NaN | 15/9/1978 | Existing | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 33656.16 | NaN | http://www.nea.gov.sg/images/default-source/Ha... | I | NaN | NaN | 30/11/2005 | 33976.33 | Aljunied Ave 2 | 380117.0 | NaN | NaN | Blk 117, Aljunied Ave 2, Singapore 380117 | AA7744C0BF961D25 | 2.020000e+13 |
| 2 | 103.846607 | 1.279218 | Amoy Street Food Centre (Telok Ayer Food Centre) | Downtown Core | 4.3 | 2542.0 | HC | HUP Standard Upgrading | NaN | NaN | 1983 | Existing | NaN | NaN | NaN | NaN | NaN | NaN | 3539.00 | NaN | NaN | NaN | NaN | 29075.00 | NaN | http://www.nea.gov.sg/images/default-source/Ha... | I | NaN | National Development Building | 17/2/2003 | 29478.87 | Telok Ayer Street | 69111.0 | NaN | NaN | National Development Building, Annex B, Telok ... | 81106E0569F14101 | 2.020000e+13 |
| 3 | 103.839563 | 1.311892 | Newton Food Centre | Newton | 4.2 | 10679.0 | HC | HUP Standard Upgrading | 500 | NaN | 1971 | Existing | NaN | NaN | NaN | NaN | NaN | NaN | 3252.75 | NaN | NaN | NaN | NaN | 32687.91 | NaN | http://www.nea.gov.sg/images/default-source/Ha... | I | NaN | NaN | 12/6/2006 | 28694.91 | Clemenceau Ave North | 229495.0 | NaN | NaN | 500, Clemenceau Ave North, Singapore 229495 | A3E978497212B9F7 | 2.020000e+13 |
| 4 | 103.888490 | 1.396914 | Anchorvale Village Hawker Centre | NaN | NaN | NaN | HC | New Centre | 339 | NaN | 2022 | Under Construction | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | hawker centre/supermarket/shops/flats | NaN | 27/4/2018 | 42089.26 | NaN | NaN | NaN | NaN | Anchorvale Village | NaN | 34139.80 | Anchorvale Road | NaN | NaN | 28/5/2018 | NaN | F50A43DB63480EAC | 2.020000e+13 |
hawker.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 122 entries, 0 to 121 Data columns (total 38 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X 122 non-null float64 1 Y 122 non-null float64 2 Name 122 non-null object 3 Area 117 non-null object 4 Rating 110 non-null float64 5 Reviews 110 non-null float64 6 Type 122 non-null object 7 description 122 non-null object 8 ADDRESSBLOCKHOUSENUMBER 114 non-null object 9 LATITUDE 0 non-null float64 10 EST_ORIGINAL_COMPLETION_DATE 122 non-null object 11 STATUS 122 non-null object 12 CLEANINGSTARTDATE 0 non-null float64 13 ADDRESSUNITNUMBER 0 non-null float64 14 ADDRESSFLOORNUMBER 0 non-null float64 15 NO_OF_FOOD_STALLS 0 non-null float64 16 HYPERLINK 0 non-null float64 17 REGION 0 non-null float64 18 APPROXIMATE_GFA 33 non-null float64 19 LONGITUDE 0 non-null float64 20 INFO_ON_CO_LOCATORS 14 non-null object 21 NO_OF_MARKET_STALLS 0 non-null float64 22 AWARDED_DATE 17 non-null object 23 LANDYADDRESSPOINT 122 non-null float64 24 CLEANINGENDDATE 0 non-null float64 25 PHOTOURL 111 non-null object 26 ADDRESSTYPE 101 non-null object 27 RNR_STATUS 0 non-null float64 28 ADDRESSBUILDINGNAME 15 non-null object 29 HUP_COMPLETION_DATE 95 non-null object 30 LANDXADDRESSPOINT 122 non-null float64 31 ADDRESSSTREETNAME 120 non-null object 32 ADDRESSPOSTALCODE 115 non-null float64 33 DESCRIPTION_MYENV 0 non-null float64 34 IMPLEMENTATION_DATE 17 non-null object 35 ADDRESS_MYENV 111 non-null object 36 INC_CRC 122 non-null object 37 FMEL_UPD_D 122 non-null float64 dtypes: float64(21), object(17) memory usage: 36.3+ KB
# Dropping columns that are more than 50% null
perc = 50
min_count = int(((100-perc)/100)*hawker.shape[0] + 1)
hawker = hawker.dropna(axis=1,
thresh=min_count)
hawker.columns = hawker.columns.str.lower()
# Rename x and y columns to longitude and latitude respectively.
# We can also drop the landaddresspoint columns, which represent additional geocoding points that we won't need
hawker = hawker.rename(columns={'x':'longitude', 'y':'latitude'})
hawker = hawker.drop(columns= ['landxaddresspoint','landyaddresspoint'])
hawker.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 122 entries, 0 to 121 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 longitude 122 non-null float64 1 latitude 122 non-null float64 2 name 122 non-null object 3 area 117 non-null object 4 rating 110 non-null float64 5 reviews 110 non-null float64 6 type 122 non-null object 7 description 122 non-null object 8 addressblockhousenumber 114 non-null object 9 est_original_completion_date 122 non-null object 10 status 122 non-null object 11 photourl 111 non-null object 12 addresstype 101 non-null object 13 hup_completion_date 95 non-null object 14 addressstreetname 120 non-null object 15 addresspostalcode 115 non-null float64 16 address_myenv 111 non-null object 17 inc_crc 122 non-null object 18 fmel_upd_d 122 non-null float64 dtypes: float64(6), object(13) memory usage: 18.2+ KB
hawker = hawker.rename(columns={'est_original_completion_date':'year_opened'})
# The year_opened column is a string instead of datetime
# Looking at the values to see where the string values are
hawker['year_opened'].value_counts().sort_index()
1/1/1969 1 1/11/1976 1 1/9/1967 2 11/10/1974 1 11/3/1969 1 11/4/1979 1 12/1/1980 1 12/10/1970 1 12/11/1975 1 12/5/1975 1 13/10/1976 1 14/9/1978 1 15/11/2014 1 15/9/1975 1 15/9/1978 1 16/1/1974 1 16/1/1985 1 16/10/2017 1 16/11/1969 1 16/11/1981 1 16/3/1979 1 17/10/1984 1 17/2/1973 1 17/3/1971 1 18/12/1981 1 18/4/1977 1 19/10/1974 1 19/7/1967 1 1935 1 1951 1 1957 1 1959 1 1964 1 1965 3 1971 2 1972 1 1973 1 1974 3 1975 3 1976 1 1978 2 1979 1 1983 1 2/1/1976 1 2/3/1970 1 2/7/2015 1 20/9/1976 1 2022 9 2024 2 21/1/1969 1 21/8/1979 1 23/3/2005 1 23/8/1976 1 24/1/1977 1 24/12/1979 1 24/5/2017 1 25/5/2000 1 26/1/1978 1 26/3/1976 1 26/3/1979 1 26/8/1976 1 27/2/1978 1 27/8/1974 1 28/1/1978 1 28/2/1974 1 28/7/2017 1 29/10/1975 1 29/11/1980 1 29/3/1973 1 3/1/1969 1 3/1/1973 1 3/3/1975 1 3/7/2009 1 3/7/2017 1 30/11/1982 1 30/4/1979 1 30/4/1983 1 30/5/1977 2 30/6/1977 1 30/6/1981 1 30/6/1982 2 30/7/1977 1 30/9/1983 1 31/1/1980 1 31/1/1999 1 31/10/1980 1 31/12/1979 1 31/3/1971 1 31/3/1984 1 31/5/1983 1 31/7/1979 1 31/7/2015 1 4/4/1972 1 5/2/1968 1 5/5/1980 1 6/8/2016 1 7/7/2000 1 7/8/1978 1 8/11/2017 1 8/5/2017 1 9/9/1968 1 Pre-War 1 Name: year_opened, dtype: int64
hawker[hawker['year_opened'] == 'Pre-War']
| longitude | latitude | name | area | rating | reviews | type | description | addressblockhousenumber | year_opened | status | photourl | addresstype | hup_completion_date | addressstreetname | addresspostalcode | address_myenv | inc_crc | fmel_upd_d | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 58 | 103.794863 | 1.310908 | Holland Village Market and Food Centre | Queenstown | 4.0 | 2293.0 | MHC | HUP Rebuilding | 1 | Pre-War | Existing | http://www.nea.gov.sg/images/default-source/Ha... | I | 12/12/2005 | Lorong Mambong | 277700.0 | 1, Lorong Mambong, Singapore 277700 | 69A5ED12F17FF1A2 | 2.020000e+13 |
Doing a quick search on Holland Village, it seems that the history of neighborhood goes back to the 1930s and 1940s, so I will tentatively estimate the year_opened to be 1935
hawker.loc[58, 'year_opened'] = 1935
# Convert date_opened from string to datetime
hawker['year_opened'] = pd.to_datetime(hawker['year_opened']).dt.year
# Clean the hawker centre names for the values that contain parentheses
# For those values, we will just keep the text within the parentheses
hawker['name_clean'] = hawker['name'].str.replace(r'[^(]*\(|\)[^)]*', '')
C:\Users\Rahul\AppData\Local\Temp/ipykernel_7404/3975665014.py:4: FutureWarning: The default value of regex will change from True to False in a future version. hawker['name_clean'] = hawker['name'].str.replace(r'[^(]*\(|\)[^)]*', '')
We will use Plotly Express to create the visuals for our EDA. For the scatterplot map we create, you will need a free Mapbox token. Details on how to attain the Mapbox token and further Mapbox Plotly documentation can be found at the link below.
# Creating a new df to use that doesn't include centres under construction
hawker_open = hawker[hawker['status'] != 'Under Construction']
# Grouping the df to get the total reviews by area and number of hawker centres by area
hawker_area = pd.DataFrame(hawker_open.groupby(['area'],as_index=False)[['reviews']].agg(['sum','count'])).reset_index()
hawker_area.columns = ['area','reviews','centres']
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
plotly.offline.init_notebook_mode()
fig = px.treemap(hawker_area, path=[px.Constant("Singapore"),'area'], values='centres', color = 'reviews',
color_continuous_scale=px.colors.sequential.Viridis[::-1],
title = 'Singapore Hawker Centres by <b>Planning Area<b>')
fig.update_traces(root_color="lightgrey")
fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()
token = 'YOUR_MAPBOX_TOKEN'
fig = px.scatter_mapbox(hawker_open, lat="latitude", lon="longitude", color = 'reviews',
hover_name = 'name_clean', size = 'rating',
hover_data=["rating", "reviews", 'area'], title = '<b>Map of Singapore Hawker Centres',
zoom=10, height=400, width=750, size_max=11,
color_continuous_scale=px.colors.sequential.Viridis[::-1])
fig.update_layout(mapbox_accesstoken=token, margin={"r":0,"t":30,"l":0,"b":0}, mapbox_style="streets")
fig.update_traces(text = hawker['name_clean'],
hovertemplate='<b>%{text}</b><br><br>Rating: %{marker.size:,}<br>Reviews: %{marker.color}')
fig.show()
With the low variance in color, and based on the scale of the color legend, it looks like there isn't much variance in the number of reviews and most hawker centres have thousands of reviews. Let's look at the distribution of reviews.
# Print the median and mean to get an early sense for the skew in the data
print("Median Reviews: "+ str(round(hawker_open['reviews'].median())))
print("Mean Reviews: "+ str(round(hawker_open['reviews'].mean())))
Median Reviews: 1904 Mean Reviews: 2860
reviews_median = hawker_open['reviews'].median()
reviews_mean = hawker_open['reviews'].mean()
fig = px.histogram(hawker_open, x="reviews", marginal="box", hover_data = ['name_clean'],
title = 'Distribution of the Number of Reviews',
labels = {'reviews': '# of Reviews',}).update_layout(
yaxis_title='# of Hawker Centres')
fig.add_vline(x=reviews_median, annotation_text ='Median = 1904', row=1,
annotation_position='top left')
fig.add_vline(x=reviews_mean, line_color = 'red', annotation_text= 'Mean = 2860', row = 1,
annotation_position = 'top right')
fig.show()
The data is fairly right skewed, as half of the hawker centres have more than 1,900 reviews, with a few outliers having well over 10,000 reviews.
# Print the median and mean to get an early sense for any skew in the data
print("Median Rating: "+ str(round(hawker_open['rating'].median(),1)))
print("Mean Rating: "+ str(round(hawker_open['rating'].mean(),1)))
Median Rating: 4.1 Mean Rating: 4.1
rating_mean = hawker_open['rating'].mean()
fig = px.histogram(hawker_open, x="rating", marginal="box", hover_data = ['name_clean'],
title = 'Distribution of Hawker Centre Ratings',
labels = {'rating': 'Rating',}).update_layout(
yaxis_title='# of Hawker Centres')
fig.update_xaxes(range=[0,5])
fig.add_vline(x=rating_mean, line_color = 'red', annotation_text= 'Mean = 4.1', row = 1, annotation_position = 'top left')
fig.show()
Given that ratings can only range from 0 to 5, it's not surprising to see a normal distribution with a narrow range of values. The lowest rating is 3.7, while the top rating is 4.4, which shows that you really can't go too wrong with any hawker centre.
fig = px.histogram(hawker, x='year_opened', text_auto = True,
labels = {'year_opened': 'Year Opened'},
title = 'Distribution of Hawker Centres by <b>Year Opened').update_layout(
yaxis_title= '# of Hawker Centres')
fig.show()
Singapore became an independent country after officially separating from Malaysia in 1965. Let's create a column indicating whether the hawker centre was opened before or after independence.
hawker['pre-independence'] = hawker['year_opened'] <= 1965
hawker['pre-independence'].value_counts(normalize = True).round(2)*100
False 93.0 True 7.0 Name: pre-independence, dtype: float64
93% of Singapore's current existing hawker centres were built post-independence, with almost half of them being built within 15 years of independence.
In Singapore, there are two types of hawker centres: the traditional hawker centre that sells cooked food, and the hawker centre market, which has stalls that sell cooked food and other market stalls that sell produce and other non-cooked foods. Produce and meats tend to be cheaper at these markets than regular supermarkets, making these market hawker centres an attractive one-stop shop.
hawker['type'].value_counts(normalize = True).round(2)*100
# MHC = Market-Hawker Centre
MHC 66.0 HC 34.0 Name: type, dtype: float64
To answer this, we can modify the histogram from above to color the bars by hawker type
fig = px.histogram(hawker, x='year_opened', text_auto = True,
labels = {'year_opened': 'Year Opened'}, color = 'type',
title = 'Distribution of Hawker Centres by <b>Year Opened & Type').update_layout(
yaxis_title='# of Hawkers')
fig.show()
Interesting to see that all but one of the centres opened since 2010 are traditional hawker centres with no market attached. I would be curious to know whether health/hygiene concerns over these markets factors into the decline in new market hawker centres being opened.